Seahorse data analysis
1 Preparation
1.1 Required packages
1.2 Functions
- The functions are stored in a separate script
- The following functions were written:
- Timeline plot
- Density plot
- Barplot
source(here::here("Functions", "Functions.R"))
GGscatterPlot <- function(data, mapping, ..., method = "spearman") {
# Get correlation coefficient
x <- GGally::eval_data_col(data, mapping$x)
y <- GGally::eval_data_col(data, mapping$y)
cor <- cor(x, y, method = method)
# Assemble data frame
df <- data.frame(x = x, y = y)
# PCA
nonNull <- x != 0 & y != 0
dfpc <- prcomp(~x + y, df[nonNull, ])
df$cols <- predict(dfpc, df)[, 1]
# Define the direction of color range based on PC1 orientation:
dfsum <- x + y
colDirection <- ifelse(dfsum[which.max(df$cols)] < dfsum[which.min(df$cols)],
1, -1)
# Get 2D density for alpha
dens2D <- MASS::kde2d(df$x, df$y)
df$density <- fields::interp.surface(dens2D, df[, c("x", "y")])
if (any(df$density == 0)) {
mini2D = min(df$density[df$density != 0]) #smallest non zero value
df$density[df$density == 0] <- mini2D
}
# Prepare plot
pp <- ggplot(df, aes(x = x, y = y, color = cols, alpha = 1/density)) + ggplot2::geom_point(shape = 16,
show.legend = FALSE) + ggplot2::scale_color_viridis_c(direction = colDirection) +
# scale_color_gradient(low = '#0091ff', high = '#f0650e') +
ggplot2::scale_alpha(range = c(0.05, 0.6)) + ggplot2::geom_abline(intercept = 0,
slope = 1, col = "darkred") + ggplot2::geom_label(data = data.frame(xlabel = min(x,
na.rm = TRUE), ylabel = max(y, na.rm = TRUE), lab = round(cor, digits = 3)),
mapping = ggplot2::aes(x = xlabel, y = ylabel, label = lab), hjust = 0, vjust = 1,
size = 3, fontface = "bold", inherit.aes = FALSE # do not inherit anything from the ...
) +
theme_minimal()
return(pp)
}2 Data preparation
2.1 Load and combine data
- Data from Gabriel (Lifespan_data)
- Data from Seahorse analysis
- Merge (be careful that you merge the correct samples!)
- PPR measurements removed (they were used to calculate ATP production)
## Load Gabriels data table
data_Gab <- read_csv(here::here("Data", "Lifespan_Study_Data_updated.csv")) %>% filter(Treatment %in%
c("Control_21", "Control_3")) %>% filter(Cell_Line %in% c("hFB11", "hFB12", "hFB13")) %>%
filter(Date %in% c("03/07/2019", "03/21/2019", "04/04/2019", "04/18/2019")) %>%
mutate(Date_of_Experiment = case_when(Date == "03/07/2019" ~ "2019.03.08", Date ==
"03/21/2019" ~ "2019.03.22", Date == "04/04/2019" ~ "2019.04.05", Date ==
"04/18/2019" ~ "2019.04.19")) %>% dplyr::rename(Treatment_Oxygen = Treatment) %>%
mutate(Treatment = case_when(Treatments == "Control" ~ "Ctrl")) %>% select(-Treatments) %>%
droplevels()
## Remove columns that only contain NAs
data_Gab <- data_Gab %>% select_if(~!sum(is.na(.)) >= nrow(data_Gab))
## Load the Seahorse data
data_Seahorse <- list.files(here::here("Data"), pattern = "*Final.csv", full.names = T) %>%
lapply(read_csv) %>% bind_rows %>% separate(X1, into = c("Cell_Line", "Percent_Oxygen",
"Treatment")) %>% select(-contains("PPR")) %>% filter(Treatment %in% "Ctrl") %>%
droplevels()
data_Seahorse$Percent_Oxygen <- as.numeric(data_Seahorse$Percent_Oxygen)
## Finally combine both datasets
data_combined <- full_join(data_Gab, data_Seahorse, by = c("Cell_Line", "Date_of_Experiment",
"Percent_Oxygen", "Treatment")) %>% unite(Group, Cell_Line, Percent_Oxygen, remove = FALSE)
## Remove the percent sign from Telomere_Length_CV
data_combined$Telomere_Length_CV <- as.numeric(str_remove(data_combined$Telomere_Length_CV,
"%"))
data_combined$Cell_Line <- as.factor(data_combined$Cell_Line)
data_combined$Percent_Oxygen <- as.factor(data_combined$Percent_Oxygen)2.2 Check notes
- Gabriel has a column called “notes”
- Here we retrieve further information for the subset of samples we’re using
- From the notes column we see that hFB11 at 62 days grown was infected
- Will be excluded
test <- data_combined %>% select(Cell_Line, Percent_Oxygen, Days_Grown, Passage,
Notes)
test[which(!test$Notes == "NA"), ]# A tibble: 1 x 5
Cell_Line Percent_Oxygen Days_Grown Passage Notes
<fct> <fct> <dbl> <dbl> <chr>
1 hFB11 21 62 15 Infected
2.3 Remove empty columns without data
- Also remove RNA seq info
2.4 Choose independent and dependent variables
InDependent_vars <- c("Sample", "Cell_Line_Group", "Unique_Variable_Name", "Group",
"Cell_Line", "Cell_Type", "Sex", "Clinical_Condition", "Study_Part", "Replicate_Line",
"Treatment", "Treatment_Oxygen", "Percent_Oxygen", "Date", "Time", "Date_1",
"pre_study_doublings", "Cells_Plated", "Cells_Counted", "Days_Per_Passage", "basename",
"Date_of_Experiment", "Experiment_Name", "mitotic_age_per_day", "mitotic_age_per_doubling",
"Donor_Age", "Donor_Age_with_gestation", "Passage", "Days_Grown", "Intial_Mitotic_Age",
"pre_study_days_grown", "pre_study_doublings")
Dependent_vars <- colnames(data_combined)[!colnames(data_combined) %in% c(InDependent_vars)]
dta <- qpcR:::cbind.na(InDependent_vars, Dependent_vars)
dta[is.na(dta)] <- " "
knitr::kable(dta, caption = "")| InDependent_vars | Dependent_vars |
|---|---|
| Sample | Divisions_Per_Passage |
| Cell_Line_Group | Population_Doublings |
| Unique_Variable_Name | MiAge_Population_Doublings |
| Group | MiAge_Days_Grown |
| Cell_Line | Population_Doubling_Time |
| Cell_Type | Doubling_Rate |
| Sex | Cell_Size |
| Clinical_Condition | Cell_Volume |
| Study_Part | Percent_Dead |
| Replicate_Line | Telomere_Length |
| Treatment | Telomere_Length_CV |
| Treatment_Oxygen | IL6 |
| Percent_Oxygen | IL6_Normalized |
| Date | Copy_Number |
| Time | cf_mtDNA |
| Date_1 | cf_nDNA |
| pre_study_doublings | PanTissue_Clock |
| Cells_Plated | PanTissue_Normalized |
| Cells_Counted | Hannum_Clock |
| Days_Per_Passage | Skin_Blood_Clock |
| basename | PhenoAge_Clock |
| Date_of_Experiment | GrimAge_Clock |
| Experiment_Name | GrimAge_Normalized |
| mitotic_age_per_day | DNAmTL |
| mitotic_age_per_doubling | Mitotic_Age |
| Donor_Age | DunedinPoAm_Age |
| Donor_Age_with_gestation | Baseline_Respiration |
| Passage | NonMitochondrial_Respiration |
| Days_Grown | Basal_Respiration |
| Intial_Mitotic_Age | Max_Respiration |
| pre_study_days_grown | Spare_Respiration |
| pre_study_doublings | Proton_Leak |
| ATPlinked_Respiration | |
| Coupling_Efficiency | |
| Baseline_ECAR | |
| Max_ECAR | |
| Spare_ECAR | |
| ATPtotal | |
| ATPglyc | |
| ATPox | |
| ATPtotal_max | |
| ATPglyc_max | |
| ATPox_max | |
| ATPtotal_spare | |
| ATPox_spare | |
| ATPglyc_spare | |
| Resting_Metabolic_Rate | |
| Max_Metabolic_Rate | |
| PicoWatts | |
| Max_PicoWatts | |
| OCRcoupled |
2.5 Choose subgroups
- TBD
- I would like to divide the dataset into subgroups and compare for instance Mito vars with Age and time vars
Age_time_vars <- c("MiAge_Population_Doublings", "MiAge_Days_Grown", "Telomere_Length",
"Telomere_Length_CV", "PanTissue_Clock", "PanTissue_Normalized", "Hannum_Clock",
"Skin_Blood_Clock", "PhenoAge_Clock", "GrimAge_Clock", "GrimAge_Normalized",
"DNAmTL", "Mitotic_Age", "DunedinPoAm_Age")
Seahorse_vars <- c("Baseline_Respiration", "NonMitochondrial_Respiration", "Basal_Respiration",
"Max_Respiration", "Spare_Respiration", "Proton_Leak", "ATPlinked_Respiration",
"Coupling_Efficiency", "Baseline_ECAR", "Max_ECAR", "Spare_ECAR", "ATPtotal",
"ATPglyc", "ATPox", "ATPtotal_max", "ATPglyc_max", "ATPox_max", "ATPtotal_spare",
"ATPox_spare", "ATPglyc_spare", "Resting_Metabolic_Rate", "Max_Metabolic_Rate",
"PicoWatts", "Max_PicoWatts", "OCRcoupled")
Glyc_vars <- c("NonMitochondrial_Respiration", "Baseline_ECAR", "Max_ECAR", "Spare_ECAR",
"ATPglyc", "ATPtotal_max", "ATPglyc_max", "ATPglyc_spare", "ATPtotal_max", "ATPglyc_max",
"ATPox_max", "ATPtotal_spare", "ATPox_spare", "ATPglyc_spare")
Mito_vars <- c("OCRcoupled", "ATPox_max", "ATPox_spare", "ATPox", "Proton_Leak",
"Basal_Respiration", "Max_Respiration", "Spare_Respiration", "ATPlinked_Respiration",
"Coupling_Efficiency", "Baseline_Respiration", "Copy_Number", "cf_mtDNA", "cf_nDNA")
ATP_vars <- c("ATPlinked_Respiration", "ATPtotal", "ATPglyc", "ATPox")
Doubling_vars <- c("Divisions_Per_Passage", "Population_Doublings", "Population_Doubling_Time",
"Doubling_Rate")
Cell_vars <- c("Cell_Size", "Cell_Volume", "Percent_Dead", "Divisions_Per_Passage",
"Population_Doublings", "Population_Doubling_Time", "Doubling_Rate")
Immune_vars <- c("IL6", "IL6_Normalized", "cf_mtDNA", "cf_nDNA")
dta <- qpcR:::cbind.na(Age_time_vars, Seahorse_vars, Glyc_vars, Mito_vars, ATP_vars,
Doubling_vars, Cell_vars, Immune_vars)
dta[is.na(dta)] <- " "
knitr::kable(dta, caption = "")| Age_time_vars | Seahorse_vars | Glyc_vars | Mito_vars | ATP_vars | Doubling_vars | Cell_vars | Immune_vars |
|---|---|---|---|---|---|---|---|
| MiAge_Population_Doublings | Baseline_Respiration | NonMitochondrial_Respiration | OCRcoupled | ATPlinked_Respiration | Divisions_Per_Passage | Cell_Size | IL6 |
| MiAge_Days_Grown | NonMitochondrial_Respiration | Baseline_ECAR | ATPox_max | ATPtotal | Population_Doublings | Cell_Volume | IL6_Normalized |
| Telomere_Length | Basal_Respiration | Max_ECAR | ATPox_spare | ATPglyc | Population_Doubling_Time | Percent_Dead | cf_mtDNA |
| Telomere_Length_CV | Max_Respiration | Spare_ECAR | ATPox | ATPox | Doubling_Rate | Divisions_Per_Passage | cf_nDNA |
| PanTissue_Clock | Spare_Respiration | ATPglyc | Proton_Leak | Population_Doublings | |||
| PanTissue_Normalized | Proton_Leak | ATPtotal_max | Basal_Respiration | Population_Doubling_Time | |||
| Hannum_Clock | ATPlinked_Respiration | ATPglyc_max | Max_Respiration | Doubling_Rate | |||
| Skin_Blood_Clock | Coupling_Efficiency | ATPglyc_spare | Spare_Respiration | ||||
| PhenoAge_Clock | Baseline_ECAR | ATPtotal_max | ATPlinked_Respiration | ||||
| GrimAge_Clock | Max_ECAR | ATPglyc_max | Coupling_Efficiency | ||||
| GrimAge_Normalized | Spare_ECAR | ATPox_max | Baseline_Respiration | ||||
| DNAmTL | ATPtotal | ATPtotal_spare | Copy_Number | ||||
| Mitotic_Age | ATPglyc | ATPox_spare | cf_mtDNA | ||||
| DunedinPoAm_Age | ATPox | ATPglyc_spare | cf_nDNA | ||||
| ATPtotal_max | |||||||
| ATPglyc_max | |||||||
| ATPox_max | |||||||
| ATPtotal_spare | |||||||
| ATPox_spare | |||||||
| ATPglyc_spare | |||||||
| Resting_Metabolic_Rate | |||||||
| Max_Metabolic_Rate | |||||||
| PicoWatts | |||||||
| Max_PicoWatts | |||||||
| OCRcoupled |
3 Data normalization
3.1 Zscore transform
\[ Z = \frac{x-\mu}\sigma \] \[Z = standard\ score; \ x = observed\ value; \ \mu = mean\ of\ the\ sample; \ \sigma = standard\ deviation\ of\ the\ sample\] - The result should be a dataframe called “data_zscore”
3.2 Baseline normalization/Fold change
- At each time point, for each cell line, set control (21% oxygen) to 100%
data_long <- data_combined %>% pivot_longer(cols = paste(Dependent_vars), names_to = "feature",
values_to = "value")
data_FC <- data_long %>% filter(Percent_Oxygen %in% "21") %>% dplyr::rename(baseline = value) %>%
select(-c("Sample", "Cell_Line_Group", "Unique_Variable_Name", "Group", "Cell_Type",
"Sex", "Clinical_Condition", "Donor_Age", "Donor_Age_with_gestation", "Study_Part",
"Replicate_Line", "Percent_Oxygen", "Date", "Time", "Date_1", "Treatment_Oxygen",
"mitotic_age_per_day", "mitotic_age_per_doubling", "Intial_Mitotic_Age",
"pre_study_days_grown", "pre_study_doublings", "Cells_Plated", "Cells_Counted",
"Days_Per_Passage", "basename", "Experiment_Name")) %>% full_join(data_long,
by = c("Cell_Line", "Days_Grown", "Treatment", "Passage", "Date_of_Experiment",
"feature")) %>% mutate(Normalized = (value/baseline) * 100) %>% select(-c(baseline,
value)) %>% pivot_wider(names_from = "feature", values_from = "Normalized")
data_FC <- na_if(data_FC, Inf)
data_FC <- na_if(data_FC, -Inf)3.3 Log2 Fold change
- Note: Don’t normalize based on days grown, for some experiments this factor differs by one day. Better to base the normalization on date of experiment and passage
- The result should be a dataframe called “data_logFC”
data_long <- data_combined %>% pivot_longer(cols = paste(Dependent_vars), names_to = "feature",
values_to = "value") #%>%
# filter(Cell_Line %in% 'hFB12') %>% filter(Date_of_Experiment %in% '2019.04.19'
# ) %>% filter(feature %in% 'Telomere_Length')
data_logFC <- data_long %>% filter(Percent_Oxygen %in% "21") %>% dplyr::rename(baseline = value) %>%
select(-c("Sample", "Cell_Line_Group", "Unique_Variable_Name", "Group", "Cell_Type",
"Sex", "Clinical_Condition", "Donor_Age", "Donor_Age_with_gestation", "Study_Part",
"Replicate_Line", "Treatment_Oxygen", "Date", "Time", "Percent_Oxygen", "Date_1",
"mitotic_age_per_day", "mitotic_age_per_doubling", "Intial_Mitotic_Age",
"pre_study_days_grown", "pre_study_doublings", "Cells_Plated", "Cells_Counted",
"Days_Per_Passage", "basename", "Experiment_Name", "Days_Grown")) %>% full_join(data_long,
by = c("Cell_Line", "Treatment", "Passage", "Date_of_Experiment", "feature")) %>%
mutate(Normalized = log2(value/baseline)) %>% select(-c(baseline, value)) %>%
pivot_wider(names_from = "feature", values_from = "Normalized")
data_logFC <- na_if(data_logFC, Inf)
data_logFC <- na_if(data_logFC, -Inf)4 Timeline
5 Multivariate data visualization
5.1 Split data into hypoxia and normoxia data
- Only numeric variables
- This is used for heatmaps, correlation plots…
- A list of numerical variables is shown in results
nums <- unlist(lapply(data_combined, is.numeric))
nums_vars <- colnames(data_combined[, nums])
nums_vars <- nums_vars[!nums_vars %in% c("Sample", "Study_Part")]
print(nums_vars) [1] "Donor_Age" "Donor_Age_with_gestation"
[3] "Replicate_Line" "Passage"
[5] "Days_Grown" "mitotic_age_per_day"
[7] "mitotic_age_per_doubling" "Intial_Mitotic_Age"
[9] "pre_study_days_grown" "pre_study_doublings"
[11] "Cells_Plated" "Cells_Counted"
[13] "Days_Per_Passage" "Divisions_Per_Passage"
[15] "Population_Doublings" "MiAge_Population_Doublings"
[17] "MiAge_Days_Grown" "Population_Doubling_Time"
[19] "Doubling_Rate" "Cell_Size"
[21] "Cell_Volume" "Percent_Dead"
[23] "Telomere_Length" "Telomere_Length_CV"
[25] "IL6" "IL6_Normalized"
[27] "Copy_Number" "cf_mtDNA"
[29] "cf_nDNA" "PanTissue_Clock"
[31] "PanTissue_Normalized" "Hannum_Clock"
[33] "Skin_Blood_Clock" "PhenoAge_Clock"
[35] "GrimAge_Clock" "GrimAge_Normalized"
[37] "DNAmTL" "Mitotic_Age"
[39] "DunedinPoAm_Age" "Baseline_Respiration"
[41] "NonMitochondrial_Respiration" "Basal_Respiration"
[43] "Max_Respiration" "Spare_Respiration"
[45] "Proton_Leak" "ATPlinked_Respiration"
[47] "Coupling_Efficiency" "Baseline_ECAR"
[49] "Max_ECAR" "Spare_ECAR"
[51] "ATPtotal" "ATPglyc"
[53] "ATPox" "ATPtotal_max"
[55] "ATPglyc_max" "ATPox_max"
[57] "ATPtotal_spare" "ATPox_spare"
[59] "ATPglyc_spare" "Resting_Metabolic_Rate"
[61] "Max_Metabolic_Rate" "PicoWatts"
[63] "Max_PicoWatts" "OCRcoupled"
data_normoxia <- data_combined %>% filter(Percent_Oxygen %in% "21")
numdata_normoxia <- data_normoxia[, c(paste(nums_vars))] %>% select(-c("mitotic_age_per_day",
"mitotic_age_per_doubling", "Replicate_Line"))
data_hypoxia <- data_combined %>% filter(Percent_Oxygen %in% "3")
numdata_hypoxia <- data_hypoxia[, c(paste(nums_vars))] %>% select(-c("mitotic_age_per_day",
"mitotic_age_per_doubling", "Replicate_Line"))5.2 Define number of k-means cluster Normoxia data
set.seed(123)
data.scaled <- scale(numdata_normoxia[, Dependent_vars])
# Optimal number of clusters in the data
### Elbow method (look at the knee) Elbow method for kmeans
factoextra::fviz_nbclust(data.scaled, kmeans, method = "wss") + geom_vline(xintercept = 2,
linetype = 2) + labs(title = "Elbow method for kmeans Normoxia data")# Average silhouette for kmeans
factoextra::fviz_nbclust(data.scaled, kmeans, method = "silhouette") + labs(title = "Average silhouette for kmeans Normoxia data")### Gap statistic
library(cluster)
set.seed(123)
# Compute gap statistic for kmeans
gap_stat <- clusGap(data.scaled, FUN = kmeans, nstart = 25, K.max = 10, B = 500)
print(gap_stat, method = "firstmax")Clustering Gap statistic ["clusGap"] from call:
clusGap(x = data.scaled, FUNcluster = kmeans, K.max = 10, B = 500, nstart = 25)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'firstmax'): 1
logW E.logW gap SE.sim
[1,] 3.1986802 3.1664514389 -0.0322288 0.07213914
[2,] 3.0082048 2.8687680735 -0.1394367 0.06897213
[3,] 2.8147803 2.6388288986 -0.1759514 0.07438674
[4,] 2.6146435 2.4183011802 -0.1963423 0.08097463
[5,] 2.4010371 2.1879073332 -0.2131297 0.08846667
[6,] 2.1750176 1.9350252633 -0.2399924 0.09525075
[7,] 1.8973493 1.6439618515 -0.2533875 0.10279490
[8,] 1.5680304 1.2835718921 -0.2844585 0.11646284
[9,] 1.1030711 0.7970592104 -0.3060118 0.13960774
[10,] 0.2321638 -0.0003292294 -0.2324930 0.19157945
# Gap statistic for hierarchical clustering
gap_stat <- clusGap(data.scaled, FUN = hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat) + labs(title = "Gap statistics for hclust Normoxia data")5.3 Define number of k-means cluster Hypoxia data
set.seed(123)
data.scaled <- scale(numdata_hypoxia[, Dependent_vars])
# Optimal number of clusters in the data
### Elbow method (look at the knee) Elbow method for kmeans
factoextra::fviz_nbclust(data.scaled, kmeans, method = "wss") + geom_vline(xintercept = 2,
linetype = 2) + labs(title = "Elbow method for kmeans Hypoxia data")# Average silhouette for kmeans
factoextra::fviz_nbclust(data.scaled, kmeans, method = "silhouette") + labs(title = "Average silhouette for kmeans Hypoxia data")### Gap statistic
library(cluster)
set.seed(123)
# Compute gap statistic for kmeans
gap_stat <- clusGap(data.scaled, FUN = kmeans, nstart = 25, K.max = 10, B = 500)
print(gap_stat, method = "firstmax")Clustering Gap statistic ["clusGap"] from call:
clusGap(x = data.scaled, FUNcluster = kmeans, K.max = 10, B = 500, nstart = 25)
B=500 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
--> Number of clusters (method 'firstmax'): 1
logW E.logW gap SE.sim
[1,] 3.1925812 3.168813639 -0.02376758 0.07745762
[2,] 2.9604606 2.843781526 -0.11667906 0.06582318
[3,] 2.7828592 2.617196191 -0.16566302 0.07004801
[4,] 2.5710597 2.400967019 -0.17009264 0.07707509
[5,] 2.3383360 2.175045315 -0.16329067 0.08420521
[6,] 2.0798456 1.926005485 -0.15384007 0.09249840
[7,] 1.8180286 1.635827822 -0.18220076 0.10072828
[8,] 1.4788615 1.278847631 -0.20001391 0.11619768
[9,] 1.0082776 0.793714845 -0.21456280 0.13959036
[10,] 0.1310684 -0.003429018 -0.13449737 0.18814198
# Gap statistic for hierarchical clustering
gap_stat <- clusGap(data.scaled, FUN = hcut, K.max = 10, B = 10)
fviz_gap_stat(gap_stat) + labs(title = "Gap statistics for hclust Hypoxia data")5.4 Heatmap raw data
Scaled within each feature: - centering is done by subtracting the column means (omitting NAs) of x from their corresponding columns - scaling is done by dividing the (centered) columns of x by their standard deviations
dataHM <- data_combined %>%
unite(heatmapgp, Cell_Line, Percent_Oxygen, Days_Grown, sep="_", remove=F) %>%
column_to_rownames("heatmapgp")
dataHM <- dataHM[ c("hFB11_21_20", "hFB11_21_34", "hFB11_21_48",
"hFB11_3_20", "hFB11_3_34", "hFB11_3_48",
"hFB12_21_20", "hFB12_21_34", "hFB12_21_48", "hFB12_21_62",
"hFB12_3_20", "hFB12_3_34", "hFB12_3_48", "hFB12_3_63",
"hFB13_21_21", "hFB13_21_35","hFB13_21_49","hFB13_21_63",
"hFB13_3_21", "hFB13_3_35", "hFB13_3_49", "hFB13_3_63"
),]
dataHM$Percent_Oxygen <- as.character(dataHM$Percent_Oxygen)
#meta_data <- dataHM[,colnames(dataHM) %in% meta_names]
exprs <- dataHM[,colnames(dataHM) %in% Dependent_vars]
exprs <- t(scale(exprs))
column_ha <- HeatmapAnnotation(Donor = anno_block(gp = gpar(1:3), labels = c("Donor1", "Donor2", "Donor3")),
Days_grown = anno_barplot(as.numeric(dataHM$Days_Grown)),
Oxygen = dataHM$Percent_Oxygen,
col=list(Oxygen = c("21"= "steelblue3", "3"= "mistyrose3")))
ht <- Heatmap(exprs,
col=colorRampPalette(rev(c('#a50026','#d73027','#f46d43','#fdae61','#fee090','white','#e0f3f8','#abd9e9','#74add1','#4575b4','#313695')))(256),
#col=colorRampPalette(rev(brewer.pal(20, "RdYlBu")))(256),
top_annotation = column_ha,
show_column_names=F,
column_split = factor(dataHM$Cell_Line, levels = unique(dataHM$Cell_Line)),
row_names_gp = gpar(col = "black",fontsize = 6),
cluster_columns=FALSE)
ht = draw(ht)Figure 5.1: Figure: Here is a really important caption.
5.5 Heatmap with log2FC
dataHM <- data_logFC %>%
unite(heatmapgp, Cell_Line, Percent_Oxygen, Days_Grown, sep="_", remove=F) %>%
column_to_rownames("heatmapgp") %>%
filter(Percent_Oxygen %in% "3")
dataHM <- dataHM[ c("hFB11_3_20", "hFB11_3_34", "hFB11_3_48",
"hFB12_3_20", "hFB12_3_34", "hFB12_3_48", "hFB12_3_63",
"hFB13_3_21", "hFB13_3_35", "hFB13_3_49", "hFB13_3_63"
),]
dataHM$Percent_Oxygen <- as.character(dataHM$Percent_Oxygen)
#meta_data <- dataHM[,colnames(dataHM) %in% meta_names]
exprs <- dataHM[,colnames(dataHM) %in% Dependent_vars]
exprs <- t(exprs)
column_ha <- HeatmapAnnotation(Donor = anno_block(gp = gpar(1:3), labels = c("Donor1", "Donor2", "Donor3")),
Days_grown = anno_barplot(as.numeric(dataHM$Days_Grown))
)
ht <- Heatmap(exprs,
col=colorRampPalette(rev(c('#a50026','#d73027','#f46d43','#fdae61','#fee090','white','#e0f3f8','#abd9e9','#74add1','#4575b4','#313695')))(256),
#col=colorRampPalette(rev(brewer.pal(20, "RdYlBu")))(256),
top_annotation = column_ha,
show_column_names=F,
column_split = factor(dataHM$Cell_Line, levels = unique(dataHM$Cell_Line)),
row_names_gp = gpar(col = "black",fontsize = 6),
cluster_columns=FALSE)
ht = draw(ht)5.6 Higher or lower in hypoxia
Higher_in_hypoxia <- c("cf_nDNA", "ATPglyc", "Baseline_ECAR", "NonMitochondrial_Respiration",
"IL6_Normalized", "IL6", "cf_mtDNA")
Lower_in_hypoxia <- c("ATPtotal_spare", "Basal_Respiration", "ATPox", "ATPlinked_Respiration",
"OCRcoupled", "Max_Metabolic_Rate", "Max_Respiration", "ATPox_max", "Spare_Respiration",
"ATPox_spare", "Copy_Number", "Telomere_Length_CV")
dta <- qpcR:::cbind.na(Higher_in_hypoxia, Lower_in_hypoxia)
dta[is.na(dta)] <- " "
knitr::kable(dta, caption = "")| Higher_in_hypoxia | Lower_in_hypoxia |
|---|---|
| cf_nDNA | ATPtotal_spare |
| ATPglyc | Basal_Respiration |
| Baseline_ECAR | ATPox |
| NonMitochondrial_Respiration | ATPlinked_Respiration |
| IL6_Normalized | OCRcoupled |
| IL6 | Max_Metabolic_Rate |
| cf_mtDNA | Max_Respiration |
| ATPox_max | |
| Spare_Respiration | |
| ATPox_spare | |
| Copy_Number | |
| Telomere_Length_CV |
5.7 PCA
5.7.1 Days grown grouping
datapca <- data_combined[,c("Percent_Oxygen", "Cell_Line", paste(nums_vars) )] %>%
mutate(Days = rep("Days")) %>%
unite(Days_grown, Days_Grown, Days) %>%
droplevels()
pca_vars <- nums_vars[!nums_vars %in% c("Days_Grown", "Replicate_Line", "mitotic_age_per_day", "mitotic_age_per_doubling", "Intial_Mitotic_Age")]
datapca <- datapca %>% select(Percent_Oxygen, Cell_Line, Days_grown, paste(pca_vars))
pca.obj <- prcomp(datapca[,4:ncol(datapca)], center=TRUE, scale.=TRUE)
P <- ggbiplot(pca.obj,
obs.scale = 1,
var.scale=1,
ellipse=T,
circle=F,
varname.size=3,
var.axes=T,
groups=datapca$Days_grown, #no need for coloring, I'm making the points invisible
alpha=0) #invisible points, I add them below
P$layers <- c(geom_point(aes(color=datapca$Days_grown), cex=3), P$layers) #add geom_point in a layer underneath (only way I have to change the size of the points in ggbiplot)
P5.7.2 Cell Line grouping
datapca <- data_combined[,c("Percent_Oxygen", "Cell_Line", paste(nums_vars) )] %>%
mutate(Days = rep("Days")) %>%
unite(Days_grown, Days_Grown, Days) %>%
droplevels()
pca_vars <- nums_vars[!nums_vars %in% c("Days_Grown", "Replicate_Line", "mitotic_age_per_day", "mitotic_age_per_doubling", "Intial_Mitotic_Age")]
datapca <- datapca %>% select(Percent_Oxygen, Cell_Line, Days_grown, paste(pca_vars))
pca.obj <- prcomp(datapca[,4:ncol(datapca)], center=TRUE, scale.=TRUE)
P <- ggbiplot(pca.obj,
obs.scale = 1,
var.scale=1,
ellipse=T,
circle=F,
varname.size=3,
var.axes=T,
groups=datapca$Cell_Line, #no need for coloring, I'm making the points invisible
alpha=0) #invisible points, I add them below
P$layers <- c(geom_point(aes(color=datapca$Cell_Line), cex=3), P$layers) #add geom_point in a layer underneath (only way I have to change the size of the points in ggbiplot)
P5.7.3 Oxygen grouping
datapca <- data_combined[,c("Percent_Oxygen", "Cell_Line", paste(nums_vars) )] %>%
mutate(Days = rep("Days")) %>%
unite(Days_grown, Days_Grown, Days) %>%
droplevels()
pca_vars <- nums_vars[!nums_vars %in% c("Days_Grown", "Replicate_Line", "mitotic_age_per_day", "mitotic_age_per_doubling", "Intial_Mitotic_Age")]
datapca <- datapca %>% select(Percent_Oxygen, Cell_Line, Days_grown, paste(pca_vars))
pca.obj <- prcomp(datapca[,4:ncol(datapca)], center=TRUE, scale.=TRUE)
P <- ggbiplot(pca.obj,
obs.scale = 1,
var.scale=1,
ellipse=T,
circle=F,
varname.size=3,
var.axes=T,
groups=datapca$Percent_Oxygen, #no need for coloring, I'm making the points invisible
alpha=0) #invisible points, I add them below
P$layers <- c(geom_point(aes(color=datapca$Percent_Oxygen), cex=3), P$layers) #add geom_point in a layer underneath (only way I have to change the size of the points in ggbiplot)
P5.8 Corrplots
5.8.1 Option 1
## Corrplot side by side
par(mfrow = c(1, 2))
M <- cor(numdata_normoxia, method = "pearson", use = "complete.obs")
corrplot(M, method = "circle", tl.col = "black", tl.cex = 0.3, title = "21% Oxygen",
mar = c(0, 0, 4, 0))
P <- cor(numdata_hypoxia, method = "pearson", use = "complete.obs")
corrplot(P, method = "circle", tl.col = "black", tl.cex = 0.3, title = "3% Oxygen",
mar = c(0, 0, 4, 0))5.8.2 Option 2
- Normoxia data
## Similar heatmap with pvalues Compute correlation coefficients
test <- as.matrix(numdata_normoxia)
cor.coef <- cor(test)
# Compute correlation p-values
cor.test.p <- function(x) {
FUN <- function(x, y) cor.test(x, y)[["p.value"]]
z <- outer(colnames(x), colnames(x), Vectorize(function(i, j) FUN(x[, i], x[,
j])))
dimnames(z) <- list(colnames(x), colnames(x))
z
}
p <- cor.test.p(test)
# Create the heatmap
heatmaply_cor(cor.coef, k_col = 2, k_row = 2, node_type = "scatter", point_size_mat = -log10(p),
point_size_name = "-log10(p-value)", label_names = c("x", "y", "Correlation"),
fontsize_col = 6, fontsize_row = 6, main = "Normoxia")- Hypoxia data
## Similar heatmap with pvalues Compute correlation coefficients
test <- as.matrix(numdata_hypoxia)
cor.coef <- cor(test)
# Compute correlation p-values
cor.test.p <- function(x) {
FUN <- function(x, y) cor.test(x, y)[["p.value"]]
z <- outer(colnames(x), colnames(x), Vectorize(function(i, j) FUN(x[, i], x[,
j])))
dimnames(z) <- list(colnames(x), colnames(x))
z
}
p <- cor.test.p(test)
# Create the heatmap
heatmaply_cor(cor.coef, k_col = 2, k_row = 2, node_type = "scatter", point_size_mat = -log10(p),
point_size_name = "-log10(p-value)", label_names = c("x", "y", "Correlation"),
fontsize_col = 6, fontsize_row = 6, main = "Hypoxia")5.8.3 Option 3
ax <- list(title = "", zeroline = F, showline = FALSE, showticklabels = TRUE, showgrid = TRUE)
# Compute a correlation matrix
corr_normoxia <- round(cor(numdata_normoxia), 1)
# Compute a matrix of correlation p-values
p.mat_normoxia <- ggcorrplot::cor_pmat(numdata_normoxia)
# Visualize the lower triangle of the correlation matrix Barring the no
# significant coefficient
corr.plot_norm <- ggcorrplot::ggcorrplot(corr_normoxia, hc.order = F, type = "lower",
outline.col = "gray", p.mat = p.mat_normoxia, sig.level = 0.05, insig = "blank",
tl.cex = 6, legend.title = "Corr with p<0.001") + labs(title = "21% Oxygen") +
# theme(axis.text.x = element_text(margin=margin(-2,0,0,0))) +
coord_fixed(ratio = 0.5)
# corr.plot_norm
corr.plot_norm <- ggplotly(corr.plot_norm)
corr.plot_norm <- corr.plot_norm %>% plotly::layout(xaxis = ax, yaxis = ax)
corr.plot_norm# Compute a correlation matrix
corr_hypoxia <- round(cor(numdata_hypoxia), 1)
# Compute a matrix of correlation p-values
p.mat_hypoxia <- ggcorrplot::cor_pmat(numdata_hypoxia)
# Visualize the lower triangle of the correlation matrix Barring the no
# significant coefficient
corr.plot_hypoxia <- ggcorrplot::ggcorrplot(corr_hypoxia, hc.order = F, type = "lower",
outline.col = "gray", p.mat = p.mat_hypoxia, sig.level = 0.05, insig = "blank",
tl.cex = 6, legend.title = "Corr with p<0.001") + labs(title = "3% Oxygen") +
# theme(axis.text.x = element_text(margin=margin(-2,0,0,0))) +
coord_fixed(ratio = 0.5)
# corr.plot_norm
corr.plot_hypoxia <- ggplotly(corr.plot_hypoxia)
corr.plot_hypoxia <- corr.plot_hypoxia %>% plotly::layout(xaxis = ax, yaxis = ax)
corr.plot_hypoxia# #corr_normoxia[45:50,40:45] #corr_hypoxia[45:50,40:45] corr_normoxia[-0.5 <
# corr_normoxia & corr_normoxia < 0.5] <- NA corr_hypoxia[-0.5 < corr_hypoxia &
# corr_hypoxia < 0.5] <- NA test <- corr_normoxia - corr_hypoxia
# #test[45:50,40:45] test[-1 < test & test < 1] <- NA test[is.na(test)] <- 0
# heatmaply_cor( test, label_names = c('x', 'y', 'Difference 21% to 3%'),
# fontsize_col = 6, fontsize_row = 6, main='Hypoxia' )# library(Hmisc) # ++++++++++++++++++++++++++++ # flattenCorrMatrix #
# ++++++++++++++++++++++++++++ # cormat : matrix of the correlation coefficients
# # pmat : matrix of the correlation p-values flattenCorrMatrix <-
# function(cormat, pmat) { ut <- upper.tri(cormat) data.frame( row =
# rownames(cormat)[row(cormat)[ut]], column = rownames(cormat)[col(cormat)[ut]],
# cor =(cormat)[ut], p = pmat[ut] ) } res2<-rcorr(as.matrix(numdata_normoxia),
# type= 'pearson') sig_corr_norm <- flattenCorrMatrix(res2$r, res2$P)
# sig_corr_norm[which(sig_corr_norm$cor < 0.5 & sig_corr_norm$cor > -0.5), ] = NA
# sig_corr_norm[which(sig_corr_norm$p > 0.05), ] = NA sig_corr_norm <-
# na.omit(sig_corr_norm)%>% select(-c(cor, p))%>% as.data.frame() %>%
# rownames_to_column() res2<-rcorr(as.matrix(numdata_hypoxia), type= 'pearson')
# sig_corr_hypo <- flattenCorrMatrix(res2$r, res2$P)
# sig_corr_hypo[which(sig_corr_hypo$cor < 0.5 & sig_corr_hypo$cor > -0.5), ] = NA
# sig_corr_hypo[which(sig_corr_hypo$p > 0.05), ] = NA sig_corr_hypo <-
# na.omit(sig_corr_hypo) %>% select(-c(cor, p)) %>% as.data.frame() %>%
# rownames_to_column() corr_in_hypo <- anti_join(sig_corr_hypo, sig_corr_norm)
# cor_list <- unique(c(corr_in_hypo$row, corr_in_hypo$column ))
# sig_corr_hypo[!sig_corr_hypo[,'rowname'] %in% sig_corr_norm[,'rowname'],]
# #corr_in_normo <- sig_corr_norm[!sig_corr_norm[,'rowname'] %in%
# sig_corr_norm[,'rowname'],] cor_list <- unique(c(corr_in_hypo$row,
# corr_in_hypo$column#, #corr_in_normo$row, # corr_in_normo$column )) norm <-
# data.frame(row = c('mito', NA, NA), column = c('glyc', NA, NA))
# #%>%unite(group, row, column) hyp <- data.frame(row = c('mito', 'a', 'c'),
# column = c('glyc', 'b','d')) #%>%unite(group, row, column) full_join(norm, hyp)
# diffdf::diffdf(norm, hyp) hyp[which(norm$group %in% hyp$group)] anti_join(hyp,
# norm)5.9 Pairs
- Here I select only the features that are up or down in hypoxia
- The aim is to compare distributions, correlations etc
- Find features that are solely correlated in either condition
data_combined %>%
select(Percent_Oxygen, paste(Lower_in_hypoxia))%>%#, paste(Higher_in_hypoxia)) %>%
ggpairs(., mapping = ggplot2::aes(color=Percent_Oxygen),
lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1)),
upper = list(continuous = wrap("cor", size = 2))) +
theme_bw() +
theme(strip.placement = "outside",
text = element_text(size = 1),
strip.text.x = element_text(size = 3),
strip.text.y = element_text(size = 2))# p<- ggpairs(d, mapping = ggplot2::aes(color=Percent_Oxygen),
# lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1)),
# upper = list(continuous = wrap("cor", size = 2))) +
# theme(strip.placement = "outside", text = element_text(size = 5))
#ggplotly(p)
# pairs(subset, main = "test",
# pch = 21, bg = c("red", "blue")[unclass(subset$Percent_Oxygen)])
# library(GGally)
# d <- highlight_key(iris)
# p <- ggpairs(d, aes(colour = Species), columns = 1:5)
# ggplotly(p) %>%
# highlight("plotly_selected")df = read.table("https://raw.githubusercontent.com/clarewest/RFQAmodel/master/data/RFQAmodel_training.txt",
header = TRUE, stringsAsFactors = FALSE) %>% filter(Target %in% c("2OBA_B", "3HSB_D")) %>%
select(Target, EigenTHREADER, SAINT2, TMScore)
ggplot2::theme_set(ggplot2::theme_bw()) ## (this just globally sets the ggplot2 theme to theme_bw)
df %>% select(-Target) %>% ggpairs()df %>% ggpairs(., mapping = ggplot2::aes(colour = Target), lower = list(continuous = wrap("smooth",
alpha = 0.3, size = 0.1)))# data1<- data_combined %>% select(Percent_Oxygen, Days_Grown, Telomere_Length)
# %>% group_by(Percent_Oxygen, Days_Grown)%>% summarise_at(c('Telomere_Length'),
# mean) %>% # summarise(TL = mean(Telomere_Length, na.rm = TRUE)) %>%
# spread(Percent_Oxygen, Telomere_Length) data2 <- data_combined %>%
# select(Percent_Oxygen, Days_Grown, ATPox, ATPglyc, cf_nDNA, cf_mtDNA) %>%
# group_by(Percent_Oxygen, Days_Grown)%>% summarise_at(c('Telomere_Length',
# 'ATPox'), mean) summarise(TL = mean(Telomere_Length, na.rm = TRUE)) %>%
# spread(Percent_Oxygen, TL, ATPox, ATPglyc, cf_nDNA, cf_mtDNA)
# , ATPox, ATPglyc, cf_nDNA, cf_mtDNA)# df <- data_normoxia %>% select_if(is.numeric) %>% cor() %>% round(digits = 2)
# %>% as.data.frame() %>% filter_all(any_vars(abs(.) > 0.5)) df <- data_normoxia
# %>% select_if(is.numeric) %>% cor() %>% round(digits = 2) %>% as.data.frame()
# %>% select_if(funs(any(abs(.) > 0.5))) temp = cor(mtcars) temp[rowSums(temp >
# 0.4) > 0, colSums(temp > 0.4) > 0] df1 <- data_normoxia %>%
# select_if(is.numeric) %>% cor() %>% round(digits = 2) %>% as.data.frame() %>%
# rownames_to_column %>% gather(colname, value, -rowname) %>% filter(abs(value)
# >= 0.4)# nvars <- colnames(data_hypoxia) # To make things easy, the variable names are
# changed into x1 to xn newvarnames <- paste( 'x',1:nvars, sep='' ) # Add the new
# variable names to the matrices dimnames( cormat1 ) <- dimnames( cormat2 ) <-
# list( newvarnames, newvarnames ) # Here's the model model <- paste( 'X',
# 1:nvars, ' =~ ', 'x', 1:nvars, ';', sep='' ) # Fit the model fit <- sem( model,
# sample.cov = list( cormat1, cormat2 ), sample.nobs = c( nobs1, nobs2 ),
# group.equal = 'lv.covariances' ) # This is the outcome of the (Chi^2) test
# lavTestLRT( fit )[ 2, 5:7 ] sqrt(M^2) A <- sqrt(M^2)-sqrt(P^2) Heatmap(A)
# corrplot(A, method = 'circle', tl.col = 'black', tl.cex=0.3, title='21-3%
# Oxygen') library(ggplot2) library(plotly) library(reshape) library(Hmisc)
# library(viridis) x <- data_normoxia y <- as.matrix(x) r <- rcorr(y) m1 <-
# reshape::melt(r$r) m2 <- reshape::melt(r$P) p.value <- m2$value p <- ggplot(m1,
# aes(X1, X2, fill = value, label=p.value)) + geom_tile() + labs(x='',y='') +
# scale_fill_viridis() p ggplotly(p) ### ## # library(plotly) # trace1 <- list( #
# type = 'heatmap', # x = nums_vars, # y = nums_vars, # z = M # ) # data <-
# list(trace1) # layout <- list(title = 'Features Correlation Matrix') # p <-
# plot_ly() # p <- add_trace(p, type=trace1$type, x=trace1$x, y=trace1$y,
# z=trace1$z) # p <- layout(p, title=layout$title) # p # Compute correlation
# coefficients cor.coef <- cor(df) # Compute correlation p-values cor.test.p <-
# function(x){ FUN <- function(x, y) cor.test(x, y)[['p.value']] z <- outer(
# colnames(x), colnames(x), Vectorize(function(i,j) FUN(x[,i], x[,j])) )
# dimnames(z) <- list(colnames(x), colnames(x)) z } p <- cor.test.p(test) #
# Create the heatmap heatmaply_cor( cor.coef, node_type = 'scatter',
# point_size_mat = -log10(p), point_size_name = 'p-value', label_names = c('x',
# 'y', 'Correlation') )# index=sapply(iris, is.numeric) iris_numeric<-iris[index==TRUE] #p.mat <-
# cor.mtest(iris_numeric)$p p.mat <- corrr::correlate(iris_numeric)%>%as.matrix()
# iris_numeric%>% cor() %>% corrplot::corrplot( method = 'color',order =
# 'hclust',p.mat = p.mat, sig.level = 0.01 , pch.col = 'white',col = brewer.pal(n
# = 11, name = 'PuOr')) M <- cor(iris_numeric) corrplot.mixed(M,upper = 'color',
# tl.col = 'black') iris_numeric%>%cor()%>%corrplot::corrplot() library(ggplot2)
# library(plotly) library(reshape) library(Hmisc) library(viridis) x <-
# iris_numeric y <- as.matrix(x) r <- rcorr(y) m1 <- melt(r$r) m2 <- melt(r$P)
# p.value <- m2$value p <- ggplot(m1, aes(X1, X2, fill = value, label=p.value)) +
# geom_tile() + labs(x='',y='')+ scale_fill_viridis() ggplotly(p)5.10 Radar plots
6 Results summary
7 Other stuff
library("PerformanceAnalytics")
my_data <- mtcars[, c(1, 3, 4, 5, 6, 7)]
chart.Correlation(my_data, histogram = TRUE, pch = 19)ggpubr::ggscatter(data_combined, x = "Days_Grown", y = "Telomere_Length", color = "Cell_Line",
palette = "jco", add = "reg.line") + facet_wrap(~Cell_Line) + ggpubr::stat_cor(label.y = 4.4) +
ggpubr::stat_regline_equation(label.y = 4.2)Figure 7.1: Option 2: For detailed analysis
pairs(t(exprs[1:5, ]), main = "Anderson's Iris Data -- 3 species", pch = 21, bg = c("#1b9e77",
"#d95f02", "#7570b3")[unclass(data_combined$Cell_Line)])Figure 7.2: Option 1
library(WVPlots)
PairPlot(data_combined, colnames(data_combined)[18:21], "Anderson's Iris Data -- 3 species",
group_var = "Cell_Line")8 Notes Anna
8.2 The following code is used for two figures side by side
r out.width=c(‘50%’, ‘50%’), fig.show=‘hold’
8.3 Text side by side
Since R Markdown use the bootstrap framework under the hood. It is possible to benefit its powerful grid system. Basically, you can consider that your row is divided in 12 subunits of same width. You can then choose to use only a few of this subunits.
Here, I use 3 subunits of size 4 (4x3=12). The last column is used for a plot. You can read more about the grid system here. I got this result showing the following code in my R Markdown document.
8.4 Add tabs
8.4.1 Tab1
8.4.2 Tab2
8.5 Data tables
8.6 Highlight text
- This is my first conclusion
- This is my second conclusion
- This is my first conclusion
- This is my second conclusion
8.6.0.1 AF0951
8.7 Second way
mpg
Mazda RX4 21.0
Mazda RX4 Wag 21.0
Datsun 710 22.8
Hornet 4 Drive 21.4
Hornet Sportabout 18.7
8.8 Text formatting
R markdown allows to easily format your text. You can add links, write in bold or italic. This is very well explained in the Rstudio cheatsheet.
8.9 Skip a line
text
text
8.10 Interactive graphics
8.11 Feature ratio
- TBD (take from 2020-10-05)
- I